In [1]:
from sklearn import datasets, linear_model, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import math, scipy, numpy as np
from scipy import linalg

In [2]:
data_set = datasets.load_diabetes()

In [3]:
x_trn,x_tst,y_trn,y_tst = train_test_split(data_set.data,data_set.target,test_size=0.2)

In [5]:
x_trn.shape,x_tst.shape,y_trn.shape,y_tst.shape


Out[5]:
((353, 10), (89, 10), (353,), (89,))

In [7]:
feature_names=['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [6]:
lr = linear_model.LinearRegression()

In [8]:
def regr_metrics(act, pred):
    return (math.sqrt(metrics.mean_squared_error(act, pred)), 
     metrics.mean_absolute_error(act, pred))

In [9]:
%timeit lr.fit(x_trn,y_trn)


479 µs ± 27.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [10]:
pred = lr.predict(x_tst)

In [11]:
regr_metrics(y_tst,pred)


Out[11]:
(55.36037159904751, 45.62943684445233)
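
As a quick sanity check (a sketch, not part of the recorded run), the same two numbers can be computed directly with numpy:

rmse = np.sqrt(np.mean((y_tst - pred)**2))  # root mean squared error
mae  = np.mean(np.abs(y_tst - pred))        # mean absolute error
rmse, mae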

Polynomial Features


In [12]:
poly = PolynomialFeatures(include_bias=False)
trn_feat = poly.fit_transform(x_trn)

In [13]:
', '.join(poly.get_feature_names(feature_names))  # renamed to get_feature_names_out in scikit-learn >= 1.0


Out[13]:
'age, sex, bmi, bp, s1, s2, s3, s4, s5, s6, age^2, age sex, age bmi, age bp, age s1, age s2, age s3, age s4, age s5, age s6, sex^2, sex bmi, sex bp, sex s1, sex s2, sex s3, sex s4, sex s5, sex s6, bmi^2, bmi bp, bmi s1, bmi s2, bmi s3, bmi s4, bmi s5, bmi s6, bp^2, bp s1, bp s2, bp s3, bp s4, bp s5, bp s6, s1^2, s1 s2, s1 s3, s1 s4, s1 s5, s1 s6, s2^2, s2 s3, s2 s4, s2 s5, s2 s6, s3^2, s3 s4, s3 s5, s3 s6, s4^2, s4 s5, s4 s6, s5^2, s5 s6, s6^2'

In [14]:
trn_feat.shape


Out[14]:
(353, 65)
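
The 65 columns are the 10 original features, their 10 squares, and the 45 pairwise interactions (10 choose 2). A quick check of that count:

n = x_trn.shape[1]       # 10 original features
n + n + n*(n-1)//2       # linear + squares + interactions = 65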

In [15]:
lr.fit(trn_feat,y_trn)


Out[15]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [17]:
regr_metrics(y_tst,lr.predict(poly.transform(x_tst)))  # transform, not fit_transform, on test data; output is identical for PolynomialFeatures


Out[17]:
(60.437875212161885, 50.116067121238906)
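
Note that the degree-2 features hurt here: RMSE rises from about 55.4 to 60.4 on the held-out set, a sign that 65 features overfit 353 training rows.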

In [25]:
%timeit poly.fit_transform(x_trn)


769 µs ± 52.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

Speed up the computations with Numba


In [34]:
from numba import jit, vectorize, guvectorize, cuda, float32, void, float64

In [26]:
import math, numpy as np, matplotlib.pyplot as plt
from pandas_summary import DataFrameSummary
from scipy import ndimage

In [27]:
# Untyped and unvectorized: a plain Python loop over every element
def proc_python(xx,yy):
    zz = np.zeros(nobs, dtype='float32')  # nobs is a global, defined in the next cell
    for j in range(nobs):
        x, y = xx[j], yy[j] 
        x = x*2 - ( y * 55 )
        y = x + y*2         
        z = x + y + 99      
        z = z * ( z - .88 ) 
        zz[j] = z           
    return zz
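
This pure-Python loop is the baseline; the cells below run the same arithmetic vectorized with numpy and then compiled with numba.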

In [28]:
nobs = 10000
x = np.random.randn(nobs).astype('float32')
y = np.random.randn(nobs).astype('float32')

In [29]:
%timeit proc_python(x,y)   # Untyped and unvectorized


76.2 ms ± 7.86 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Numpy


In [30]:
def proc_numpy(x,y):
    z = np.zeros(nobs,dtype='float32')  # kept to mirror proc_python; immediately overwritten below
    x = x*2-(y*55)
    y = x+y*2
    z = x+y+99
    z = z*(z-.88)
    return z

In [31]:
np.allclose(proc_numpy(x,y),proc_python(x,y),atol=1e-4)


Out[31]:
True

In [32]:
%timeit proc_numpy(x,y)


40.9 µs ± 4.41 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
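
Vectorizing with numpy cuts the time from roughly 76 ms to about 41 µs, a ~1,800x speedup, since the arithmetic now runs in compiled code over whole arrays.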

In [35]:
@jit()
def proc_numba(xx,yy,zz):
    # same loop as proc_python, but compiled to machine code on the first call;
    # nobs is read as a global and frozen at compile time
    for j in range(nobs):
        x, y = xx[j], yy[j]
        x = x*2 - ( y * 55 )
        y = x + y*2
        z = x + y + 99
        z = z * ( z - .88 )
        zz[j] = z
    return zz

In [36]:
z = np.zeros(nobs).astype('float32')
np.allclose( proc_numpy(x,y), proc_numba(x,y,z), atol=1e-4 )


Out[36]:
True

In [37]:
%timeit proc_numba(x,y,z)


7.72 µs ± 590 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
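
The numba-compiled loop is about 5x faster again than the numpy version: numpy allocates a temporary array for each intermediate expression, while the jitted loop fuses all of the arithmetic into a single pass over the data.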

In [38]:
@vectorize   # compiles this scalar function into a NumPy ufunc
def vec_numba(x,y):
    x = x*2 - ( y * 55 )
    y = x + y*2
    z = x + y + 99
    return z * ( z - .88 )

In [39]:
np.allclose(vec_numba(x,y), proc_numba(x,y,z), atol=1e-4 )


Out[39]:
True

In [40]:
%timeit vec_numba(x,y)


7.4 µs ± 630 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
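
@vectorize above lets numba infer the input types lazily on the first call. The float32/float64 imports in In [34] hint at the explicitly-typed variant; a minimal sketch (vec_numba_typed is a hypothetical name), which compiles eagerly at definition time and also accepts a target keyword such as target='parallel':

@vectorize([float32(float32, float32), float64(float64, float64)])
def vec_numba_typed(x, y):
    # same arithmetic as vec_numba, compiled for the listed signatures
    x = x*2 - ( y * 55 )
    y = x + y*2
    z = x + y + 99
    return z * ( z - .88 )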
